Project Description¶
In this project, we help farmers choose the best crop to grow in their fields using machine learning. However, farmers told us that, due to budget constraints, they can afford to test only one of four important soil measures:
- Nitrogen (N) content in the soil
- Phosphorous (P) content in the soil
- Potassium (K) content in the soil
- pH level of the soil
This is a classic feature selection problem. Our goal is to find out which one of these four features is the most helpful in predicting the right crop.
Background & Details¶
Farmers want to grow crops that will give them the highest yield, but testing all soil properties can be expensive. That’s why they need to choose just one soil test that gives them the most useful information.
Each crop grows best in a specific soil condition. For example, some crops need more nitrogen, while others grow better in slightly acidic soil. So choosing the right test helps farmers pick the best crop for their land.
We worked with a dataset called soil_measures.csv, which includes:
- "N" – Nitrogen level in the soil
- "P" – Phosphorous level in the soil
- "K" – Potassium level in the soil
- "ph" – Acidity/alkalinity (pH) level of the soil
- "crop" – The crop that grows best in those conditions (our target)
Each row in the dataset represents soil test results from a different field.
Task¶
Our task is to:
- Build a multi-class classification model that predicts the type of crop based on just one feature.
- Try each feature separately (N, P, K, ph) and see which one gives the best prediction score.
- Create a dictionary where:
- The key is the name of the best feature
- The value is the evaluation score (like accuracy or F1-score)
This will help farmers decide which single soil test is most valuable for predicting the right crop.
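For illustration, the final deliverable might look like the hypothetical example below (both the feature name and the score are placeholders, not results):
# Hypothetical example of the expected output format (placeholder values)
best_predictive_feature = {"K": 0.138}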
# Import required libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# load the dataset
crops = pd.read_csv("soil_measures.csv")
crops.head()
|   | N  | P  | K  | ph       | crop |
|---|----|----|----|----------|------|
| 0 | 90 | 42 | 43 | 6.502985 | rice |
| 1 | 85 | 58 | 41 | 7.038096 | rice |
| 2 | 60 | 55 | 44 | 7.840207 | rice |
| 3 | 74 | 35 | 40 | 6.980401 | rice |
| 4 | 78 | 42 | 42 | 7.628473 | rice |
# check for missing values
crops.isna().sum()
N       0
P       0
K       0
ph      0
crop    0
dtype: int64
# Check how many crops we have, i.e., multi-class target
crops.crop.unique()
array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas', 'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate', 'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple', 'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'], dtype=object)
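Since this is a 22-class problem, it is also worth checking whether the classes are balanced; a heavily skewed target would make averaged scores harder to interpret. A minimal check (output omitted here):
# Inspect the class distribution of the target
crops["crop"].value_counts()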
# Split into feature and target sets
X = crops.drop(columns="crop")
y = crops["crop"]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
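With 22 crop classes, a plain random split can leave some crops under-represented in the test set. An optional alternative, not used for the results reported below, is a stratified split, which preserves the class proportions:
# Optional: stratify by the target so each crop keeps its proportion in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)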
# Create a dictionary to store the model performance for each feature
feature_performance = {}
# for scaling the data
scaler = StandardScaler()
# Train a one-feature logistic regression model for each soil measure
for feature in ["N", "P", "K", "ph"]:
    # Scale the single feature (fit on train, apply the same scaling to test)
    X_train_scaled = scaler.fit_transform(X_train[[feature]])
    X_test_scaled = scaler.transform(X_test[[feature]])
    # Fit a multi-class logistic regression on this feature alone
    log_reg = LogisticRegression(max_iter=500, solver="saga")
    log_reg.fit(X_train_scaled, y_train)
    y_pred = log_reg.predict(X_test_scaled)
    # Weighted F1 accounts for all 22 classes
    f1 = metrics.f1_score(y_test, y_pred, average="weighted")
    feature_performance[feature] = f1
    print(f"F1-score for {feature}: {f1}")
F1-score for N: 0.10689900116508289
F1-score for P: 0.08426955444720075
F1-score for K: 0.13831456375684123
F1-score for ph: 0.045464856528065166
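The weighted F1-score averages the per-class F1-scores, weighted by each class's support. For a per-crop breakdown, a classification report can be printed for any of the fitted models; the sketch below reuses y_pred from the last loop iteration (the "ph" model):
# Per-class breakdown for the last fitted single-feature model ("ph")
print(metrics.classification_report(y_test, y_pred, zero_division=0))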
# K produced the best F1 score
best_feature = max(feature_performance, key=feature_performance.get)
print(f"\nBest feature: {best_feature} with F1-score: {feature_performance[best_feature]}")
Best feature: K with F1-score: 0.13831456375684123
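The task also asks for a dictionary mapping the best feature to its score, which follows directly from the results above:
# Final deliverable: best feature mapped to its evaluation score
best_predictive_feature = {best_feature: feature_performance[best_feature]}
print(best_predictive_feature)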
It looks like "K" (Potassium) is the best single feature for predicting the "crop" variable, but the F1-score (0.138) is still quite low. This suggests that using only one feature isn't very effective for this classification problem.¶
Try Other Models¶
Decision Tree Classifier¶
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train[["K"]], y_train)
y_pred_tree = tree_clf.predict(X_test[["K"]])
f1_tree = metrics.f1_score(y_test, y_pred_tree, average="weighted")
print(f"Decision Tree F1-score for K: {f1_tree}")
Decision Tree F1-score for K: 0.32406128337824247
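A single train/test split can be noisy, especially with scores this low. Cross-validation gives a more stable estimate; a minimal sketch using 5-fold CV with the same weighted-F1 metric:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated weighted F1 for the single-feature decision tree
cv_scores = cross_val_score(
    DecisionTreeClassifier(random_state=42),
    crops[["K"]], y, cv=5, scoring="f1_weighted"
)
print(f"Mean CV F1-score for K: {cv_scores.mean():.4f}")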
Random Forest Classifier (handles non-linearity better)¶
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train[["K"]], y_train)
y_pred_rf = rf_clf.predict(X_test[["K"]])
f1_rf = metrics.f1_score(y_test, y_pred_rf, average="weighted")
print(f"Random Forest F1-score for K: {f1_rf}")
Random Forest F1-score for K: 0.30604986326799966
Check Feature Importance¶
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
importances = rf_model.feature_importances_
feature_names = ["N", "P", "K", "ph"]
for name, importance in zip(feature_names, importances):
print(f"Feature: {name}, Importance: {importance:.4f}")
Feature: N, Importance: 0.2143
Feature: P, Importance: 0.2530
Feature: K, Importance: 0.3241
Feature: ph, Importance: 0.2086
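Impurity-based importances can be biased (for example, toward features with many distinct values), so permutation importance on the held-out test set is a useful cross-check. A sketch, assuming the rf_model fitted above:
from sklearn.inspection import permutation_importance

# Permutation importance: drop in score when each feature is shuffled
perm = permutation_importance(rf_model, X_test, y_test, n_repeats=10, random_state=42)
for name, imp in zip(X.columns, perm.importances_mean):
    print(f"Feature: {name}, Permutation importance: {imp:.4f}")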
"K" (Potassium) is the best single feature for predicting the "crop" variable, but the Decision Tree F1-score for K: 0.3240 is still quite low. Therefore single feature is not a good idea for this problem.